library(tidyverse)
library(ggraph)
library(tidygraph)
library(igraph)
library(data.table)
library(tidytable)
library(statnet)

knitr::opts_chunk$set(message=FALSE, warning=FALSE)
gkl_actors = fread('gkl_actors_full.csv')

all_actors = fread('../estc-data-unified/estc-actors-unified/actors.tsv')

Bipartite graph, projected to actor links:

g = gkl_actors %>% filter(J_divergence<.42) %>% 
  filter(!is.na(actor_id)) %>% 
  group_by(estc_id, actor_id) %>% 
  summarise(diverg = mean(J_divergence)) %>% 
  graph_from_data_frame( directed=FALSE)

V(g)$type <- bipartite_mapping(g)$type 

#gg <- unipartite_projection_attr(gkl_actors_graph, "diverg", FALSE)

gg = bipartite.projection(g)

Basic node-level stats:

Weighted degree of all in graph with < .42 threshold:

gg[[2]] %>%
  as_tbl_graph() %>% 
  activate(edges)  %>% 
  activate(nodes) %>% 
  mutate(wtd_degree = centrality_degree(weights = weight)) %>% 
  as_tibble() %>% 
  arrange(desc(wtd_degree)) %>%
  rename(actor_id = name) %>% 
  left_join(all_actors %>% 
              select(actor_id, name_unified, viaf_link), by = 'actor_id') 

Degree distribution:

gg[[2]] %>%
  as_tbl_graph() %>% 
  activate(edges)  %>% 
  activate(nodes) %>% 
  mutate(wtd_degree = centrality_degree(weights = weight)) %>% 
  as_tibble() %>%
  count(wtd_degree) %>%
  ggplot() + geom_point(aes(wtd_degree,n)) + scale_x_log10()+ scale_y_log10()

Not normally distributed but not scale-free (on a power law) either.

Betweenness centrality (not weighted):

(The sum of all the shortest paths between every pair of nodes which pass through that node)

gg[[2]] %>%
  as_tbl_graph() %>% 
  activate(edges)  %>% 
  activate(nodes) %>% 
  mutate(betweenness = centrality_betweenness()) %>% 
  as_tibble() %>% 
  arrange(desc(betweenness)) %>%
  rename(actor_id = name) %>% 
  left_join(all_actors %>% 
              select(actor_id, name_unified, viaf_link), by = 'actor_id') 

Degree and Betweenness

These two metrics are often related:

total_pub_counts = gkl_actors %>% filter(J_divergence<.42) %>% count(actor_id) %>% filter(!is.na(actor_id))

gg[[2]] %>%
  as_tbl_graph() %>% 
  activate(edges)  %>% 
  activate(nodes) %>% 
  mutate(betweenness = centrality_betweenness()) %>% 
  mutate(degree = centrality_degree(weights = weight)) %>% 
  as_tibble() %>% 
  mutate(betweenness_rank = rank(-betweenness))%>% 
  mutate(degree_rank = rank(-degree)) %>% 
  left_join(total_pub_counts, by = c('name' = 'actor_id')) %>% 
  ggplot(aes(betweenness_rank, degree_rank, size = n))   + 
  geom_rect(aes(xmin = 0, xmax = 100, ymin = 50, ymax = 500), fill = 'red', color = 'black',size = .1, pch = 21, alpha = .1) +
  geom_point(alpha = .9, pch = 21)+ scale_size_area() + theme_bw()

Plotting both and looking for outliers shows nodes with ‘surprisingly’ high betweenness rank considering their degree (looking in the highlighted area for a start) can find interesting ‘bridges’, not important in their own right but holding separate parts of the network together:

p = gg[[2]] %>%
  as_tbl_graph() %>% 
  activate(edges)  %>% 
  activate(nodes) %>% 
  mutate(betweenness = centrality_betweenness()) %>% 
  mutate(degree = centrality_degree(weights = weight)) %>% 
  as_tibble()%>%
  rename(actor_id = name) %>% 
  left_join(all_actors %>% 
              select(actor_id, name_unified, viaf_link), by = 'actor_id')  %>% 
  mutate(betweenness_rank = rank(-betweenness))%>% 
  mutate(degree_rank = rank(-degree))  %>% 
  left_join(total_pub_counts, by = c('actor_id' = 'actor_id')) %>% 
  ggplot(aes(betweenness_rank, degree_rank, size = n, text = name_unified)) + 
  geom_point(alpha = .5) + scale_size_area()


plotly::ggplotly(p)

For instance David Niven (16th for betweenness, 425th for degree)

niven_books = gkl_actors %>% filter(actor_id == 'davidniven_0') %>% pull(estc_id)

gkl_actors %>% 
  filter(estc_id %in% niven_books & J_divergence <.42) 

Who did they work with?

g = gkl_actors %>% filter(J_divergence<.42) %>% 
  filter(!is.na(actor_id)) %>% 
  group_by(estc_id, actor_id) %>% 
  summarise(diverg = mean(J_divergence)) %>% 
  graph_from_data_frame( directed=FALSE)

V(g)$type <- bipartite_mapping(g)$type 

gg = bipartite.projection(g)

actor_net = gg[[2]]

names(neighbors(actor_net, 'davidniven_0')) %>% as_tibble()%>% 
  left_join(all_actors %>% 
              select(actor_id, name_unified, viaf_link), by = c('value' = 'actor_id'))

H.D. Symonds (19th for betweenness, 169th for degree):

symonds_books = gkl_actors %>% filter(actor_id == '62707926') %>% pull(estc_id)

gkl_actors %>% 
  filter(estc_id %in% symonds_books & J_divergence <.42) %>% 
  filter(!is.na(actor_id)) %>% 
  pull(actor_id)
##  [1] "bbti_34706"       "bbti_58231"       "62707926"         "62707926"        
##  [5] "bbti_64114"       "messrichardson_0" "bbti_57604"       "62707926"        
##  [9] "24680588"         "bbti_22175"       "62707926"         "bbti_58231"      
## [13] "16320887"         "62707926"         "bbti_5526"        "bbti_77753"      
## [17] "39231857"         "62707926"         "bbti_109535"      "peterhill_0"     
## [21] "17068965"         "62707926"         "bbti_97091"       "bbti_72631"      
## [25] "westandhughes_1"  "24680588"         "messrichardson_0" "bbti_22173"      
## [29] "62707926"         "62707926"         "bbti_45088"       "62707926"        
## [33] "bbti_5155"        "62707926"         "bbti_97091"       "bbti_34268"      
## [37] "62707926"         "bbti_34706"       "bbti_58231"       "62707926"        
## [41] "62707926"         "128307380"        "bbti_97091"       "bbti_59971"      
## [45] "bbti_109535"      "peterhill_0"      "17068965"         "62707926"        
## [49] "bbti_97091"       "bbti_72631"       "westandhughes_1"  "bbti_59971"      
## [53] "bbti_77744"       "bbti_62208"       "71631229"         "bbti_34268"      
## [57] "62707926"
names(neighbors(actor_net, '62707926')) %>% 
  as_tibble()%>% 
  left_join(all_actors %>% 
              select(actor_id, name_unified, viaf_link), by = c('value' = 'actor_id'))

J. Duncan and Son (22nd for betweenness, 397 for degree):

duncan_books = gkl_actors %>% filter(actor_id == 'jduncanandson_0') %>% pull(estc_id)

gkl_actors %>% 
  filter(estc_id %in% duncan_books & J_divergence <.42) 
names(neighbors(actor_net, 'jduncanandson_0')) %>% as_tibble() %>% 
  left_join(all_actors %>% 
              select(actor_id, name_unified, viaf_link), by = c('value' = 'actor_id'))

Eigenvector centrality:

(Scores a node’s centrality based on its connections to other important nodes). Might suggest book trade actors who were influential because of their connections, or because they ‘had the ear’ of important individuals.

actor_net %>%
  as_tbl_graph() %>% 
  mutate(eigen = centrality_eigen(weights = weight)) %>% 
  as_tibble() %>% arrange(desc(eigen))%>% 
  left_join(all_actors %>% 
              select(actor_id, name_unified, viaf_link), by = c('name' = 'actor_id'))

Some changes from highest degree e.g William Strahan is 16th highest by degree but 6th in eigenvector centrality.

Edge-level metrics:

Highest-weighted edges:

actor_net %>% as_tbl_graph() %>% activate(edges)%>% 
  mutate(to_name = .N()$name[to], 
         from_name = .N()$name[from]) %>% 
  as_tibble() %>% 
  select(from = from_name, to = to_name, weight) %>% 
  arrange(desc(weight)) %>% 
  left_join(all_actors %>% select(name_unified, actor_id), by = c('from' = 'actor_id'))%>% 
  left_join(all_actors%>% select(name_unified, actor_id), by = c('to' = 'actor_id'))

We could look at overlapping works for these pairs e.g:

c_works = gkl_actors %>% filter(J_divergence <.42& actor_id == '18758830') %>% pull(estc_id)

s_works = gkl_actors %>% filter(J_divergence <.42& actor_id == '39467138')%>% pull(estc_id)

intersect(c_works, s_works) %>% as_tibble() %>% 
  left_join(gkl_actors, by = c('value' = 'estc_id')) %>% filter(actor_id %in% c('18758830', '39467138'))

Time series approach:

Calculate degree scores for networks consisting of one year of data:

get_yearly_stats = function(df){
  
 g =  df %>% filter(J_divergence<.42) %>% 
  filter(!is.na(actor_id)) %>% 
  group_by(estc_id, actor_id) %>% 
  summarise(diverg = mean(J_divergence)) %>% 
  graph_from_data_frame( directed=FALSE)

V(g)$type <- bipartite_mapping(g)$type 

gg = bipartite.projection(g)

actor_net = gg[[2]]

stats = actor_net %>% as_tbl_graph() %>% 
  mutate(degree = centrality_degree(weights = weight)) %>% 
  mutate(between = centrality_betweenness()) %>% 
  mutate(louvain = group_louvain(weights = weight)) %>% 
  as_tibble()

stats

  
}

list_of_dfs = list()

for(i in 1700:1800){
  
  list_of_dfs[[as.character(i)]] = gkl_actors %>% 
    filter(publication_year == i)
}

results = map(list_of_dfs, 
      possibly(get_yearly_stats, otherwise = NA_character_) )

na.omit.list <- function(y) { return(y[!sapply(y, function(x) all(is.na(x)))]) }

results = rbindlist(results %>% na.omit.list, idcol = 'year')

Sum of degree scores by year:

results %>% mutate(year = as.numeric(year)) %>%
  count(year, wt = degree) %>% 
  ggplot() + geom_col(aes(year, n))

Communities:

Interactive exploratory map of communities found for all books with <.42 divergence: Labels are sized by degree score. Filtered to edges with a weight for more than 1.

g = actor_net %>% as_tbl_graph() %>% 
  mutate(louvain = group_louvain(weights = weight)) %>% 
  mutate(color = louvain)

filtered_g = g %>% 
  activate(edges) %>% filter(weight>1) %>% 
  activate(nodes) %>% 
  mutate(degree = centrality_degree(mode = 'all', weights=  weight)) %>% filter(degree>0) %>% left_join(all_actors, by = c('name' = 'actor_id')) %>% 
  mutate(actor_id = name) %>%
  mutate(name = paste0(name_unified, " (", name, ")")) %>% mutate(size =5 ) %>% 
  mutate(font.size =sqrt(degree))

visNetwork::visIgraph(filtered_g, layout = 'layout_with_kk', physics = T)%>% 
  visNetwork::visEdges(width = .01, color = list(opacity = .3))%>%
 visNetwork::visOptions(selectedBy = "louvain")

Looks like two ‘core’ communities, community 1 with Lowndes, Fauldner etc. I think some of these are generational communities - because there are father/son pairs in different communities.

To check:

comms = g %>% filter(louvain %in% 1:10) %>%as_tibble()

gkl_actors %>% left_join(comms, by = c('actor_id' = 'name')) %>% 
  count(louvain, publication_year) %>% filter(!is.na(louvain)) %>%  
  ggplot() + geom_col(aes(x = publication_year, y =n, fill = as.factor(louvain)))

Might be more meaningful to divide the communities by 10/20 years.

Not all are just temporal. For example community 3 almost complete cut off, except for connection through Luke White to John Debrett. These are Dublin BT actors:

g %>% 
  filter(louvain ==3) %>% 
  pull(name) %>% as_tibble() %>% 
  inner_join(gkl_actors, by = c('value' = 'actor_id')) %>% count(publication_place)

What is the divergence profile of books worked on by actors in these communities?

louvain_df = g %>% as_tibble() %>% select(name, louvain)

gkl_actors  %>% 
  left_join(louvain_df, by = c('actor_id'= 'name')) %>%
  filter(louvain %in% 1:15) %>% 
  ggplot() + 
  geom_density(aes(J_divergence)) + 
  facet_wrap(~louvain, ncol = 3)

What authors did they publish on?

gkl_actors  %>% filter(J_divergence<.42) %>% 
  left_join(louvain_df, by = c('actor_id'= 'name')) %>%
  filter(louvain %in% 1:10) %>% 
  count(louvain, author) %>% 
  arrange(desc(n)) %>% 
  filter(!is.na(author)) %>% 
  group_by(louvain) %>% top_n(10, wt = n) %>%
  summarise(authors = paste0(author, " (", n, ")", collapse = "; ")) 

What about the j_divergence profiles of the entire network?

all_g = gkl_actors %>% 
  filter(!is.na(actor_id)) %>% 
  group_by(estc_id, actor_id) %>% 
  summarise(diverg = mean(J_divergence)) %>% 
  graph_from_data_frame( directed=FALSE)

V(all_g)$type <- bipartite_mapping(all_g)$type 

#gg <- unipartite_projection_attr(gkl_actors_graph, "diverg", FALSE)

all_gg = bipartite.projection(all_g)
all_gg_louvain_w = all_gg[[2]] %>% 
  as_tbl_graph() %>% 
  mutate(louvain = group_louvain(weights = weight)) %>% 
  as_tibble()



gkl_actors %>% left_join(all_gg_louvain_w, by = c('actor_id' = 'name')) %>%
  filter(louvain %in% 1:15) %>% 
  ggplot() + 
  geom_histogram(aes(J_divergence), binwidth = .01) + 
  facet_wrap(~louvain, ncol = 3, scales = 'free_y')

Network graph

degree_scores = all_gg[[2]] %>% 
  as_tbl_graph() %>% 
  mutate(degree = centrality_degree(mode = 'all', weights = weight)) %>% 
  as_tibble()


edges = gkl_actors %>%
  mutate(estc_id_with_j = paste0(estc_id, " (", round(J_divergence,3), ")")) %>% select(actor_id, estc_id,estc_id_with_j)

nodes = all_gg[[2]] %>% as_tbl_graph()  %>% as_tibble() %>% mutate(id = 1:nrow(.))


multigraph_edges = all_gg[[2]] %>%
  as_tbl_graph() %>% activate(edges) %>%
  as_tibble() %>%
  left_join(nodes, by = c('from' = 'id')) %>%
  left_join(nodes, by = c('to' = 'id')) %>%
  left_join(edges, by =c('name.x' = 'actor_id'))%>%
  left_join(edges, by =c('name.y' = 'actor_id')) %>%
  filter(estc_id.x == estc_id.y) %>%
  select(from, to, name.x, name.y, estc_id.x, estc_id_with_j.x) %>% 
  left_join(gkl_actors %>% 
              distinct(estc_id, short_title), by = c('estc_id.x' = 'estc_id')) %>% 
  mutate(title = paste0(short_title, " (", estc_id_with_j.x, ")"))%>% 
  left_join(all_actors %>% 
              select(actor_id, name_unified), by = c('name.x' = 'actor_id'))%>% 
  left_join(all_actors %>% 
              select(actor_id, name_unified), by = c('name.y' = 'actor_id'))%>% 
  mutate(name.x = paste0(name_unified.x, " (", name.x, ")")) %>% 
  mutate(name.y = paste0(name_unified.y, " (", name.y, ")"))

works_shared = multigraph_edges %>% group_by(from, to, name.x, name.y) %>% 
  summarise(works = paste0(title, collapse = '; '))

filtered_g = all_gg[[2]] %>% 
  as_tbl_graph()%>% 
  activate(edges) %>% 
  left_join(works_shared, by = c('from', 'to'))  %>% 
  mutate(title = paste0("<b>Actor1: </b>", name.x,  "<br><b>Actor2: </b>", name.y, "<br><b>WORKS: </b><br>", works)) %>% 
  activate(nodes) %>% 
  mutate(louvain = group_louvain(weights = weight)) %>% 
  mutate(color = louvain) %>% 
  activate(edges) %>% 
  filter(weight>3) %>% 
  activate(nodes) %>% 
  mutate(degree = centrality_degree(weights = weight)) %>% 
  filter(degree>0) %>% mutate(size = 5) %>% filter(louvain %in% 1:30) %>% activate(edges) %>% mutate(width = sqrt(weight))
visNetwork::visIgraph(filtered_g, layout = 'layout_with_fr', physics = F)%>% 
  visNetwork::visEdges(width = .5, color = list(opacity = .7))%>%
 visNetwork::visOptions(selectedBy = "louvain")

All communities with works and actors

This table lists works and actors for each community (taken from all the gkl_actors data).

The works column shows the twenty most frequent works worked on by actors in each community. The first number in parentheses is the j_divergence score, and the second the number of times that work is found. Editions have not been combined because they will have slightly different j_divergence scores.

The actors column is the top most significant actors by degree score, which is included in parentheses.

degree_scores = all_gg[[2]] %>% 
  as_tbl_graph() %>% 
  mutate(degree = centrality_degree(mode = 'all', weights = weight)) %>% 
  as_tibble()


actors = all_gg_louvain_w %>% 
  filter(louvain %in% 1:30) %>% 
  left_join(degree_scores) %>% arrange(desc(degree)) %>% 
  left_join(all_actors, by = c('name' = 'actor_id')) %>% 
  mutate(name = paste0(name_unified, " (", name, ") (", degree, ")")) %>% 
  group_by(louvain) %>% top_n(20, degree) %>%  
  summarise(names = paste0(name, collapse = '; '))

works = all_gg_louvain_w %>% filter(louvain %in% 1:30)  %>% 
  left_join(gkl_actors, by = c('name' = 'actor_id')) %>% 
  count(louvain, estc_id, short_title) %>% 
  left_join(gkl_actors %>% distinct(estc_id, .keep_all = T) %>% select(estc_id, J_divergence))  %>% filter(J_divergence <.42)%>% 
  mutate(work_with_count = paste0(short_title, " (", n, ")", " (", round(J_divergence,4), ")"))%>% 
  arrange(-n) %>% 
  group_by(louvain) %>% 
  top_n(20, n)  %>% 
  summarise(works = paste0(unique(work_with_count), collapse = '; ')) %>% left_join(actors, by = 'louvain')%>% DT::datatable()

works

Communities in blocks of 10 years:

get_yearly_stats = function(df){
  
 g =  df %>% filter(J_divergence<.42) %>% 
  filter(!is.na(actor_id)) %>% 
  group_by(estc_id, actor_id) %>% 
  summarise(diverg = mean(J_divergence)) %>% 
  graph_from_data_frame( directed=FALSE)

V(g)$type <- bipartite_mapping(g)$type 

gg = bipartite.projection(g)

actor_net = gg[[2]]

stats = actor_net %>% as_tbl_graph() %>% 
  mutate(degree = centrality_degree(weights = weight)) %>% 
  mutate(between = centrality_betweenness()) %>% 
  mutate(louvain = group_louvain(weights = weight)) %>% 
  as_tibble()

stats

  
}

list_of_dfs_10 = list()

for(i in seq(1700, 1800, 10)){
  
  list_of_dfs_10[[as.character(i)]] = gkl_actors %>% 
    filter(publication_decade == i)
}

results = map(list_of_dfs_10, 
      possibly(get_yearly_stats, otherwise = NA_character_) )

na.omit.list <- function(y) { return(y[!sapply(y, function(x) all(is.na(x)))]) }

results = rbindlist(results %>% na.omit.list, idcol = 'year')

So for example 1780s:

gkl_actors %>% filter(publication_decade == 1750)%>% 
  inner_join(results %>% filter(year == 1750), by =c('actor_id' = 'name'))%>%
  filter(louvain %in% 1:15) %>%
  ggplot() + 
  geom_density(aes(J_divergence)) + 
  facet_wrap(~louvain, ncol = 3)

Predicting edge ties:

Use the counts of over and under-threshold books worked on as covariate to predict edge ties in the whole network:

df = gkl_actors  %>% 
  filter(!is.na(actor_id)) %>% 
    distinct(actor_id, estc_id)
  
  
  j_count = gkl_actors  %>% 
   filter(J_divergence<.42) %>% 
  filter(!is.na(actor_id)) %>% 
    count(actor_id, name = 'j_count')
  
    not_j_count = gkl_actors  %>% 
   filter(J_divergence>=.42) %>% 
  filter(!is.na(actor_id)) %>% 
    count(actor_id, name = 'not_j_count')


g = df%>%
  graph_from_data_frame( directed=FALSE)

V(g)$type <- bipartite_mapping(g)$type 

gg = bipartite.projection(g)
  
  books_net = gg[[2]] %>% 
  as_tbl_graph() %>% 
  left_join(j_count, 
            by = c('name' = 'actor_id')) %>% 
  left_join(not_j_count, 
            by = c('name' = 'actor_id')) %>% 
    mutate(j_count = ifelse(is.na(j_count), 0, j_count)) %>% 
    mutate(not_j_count = ifelse(is.na(not_j_count), 0, not_j_count))
  
  
  stats_g_net = intergraph::asNetwork(books_net)
  
  m1 <- ergm(stats_g_net ~ edges  + nodecov('j_count'))
  
  summary(m1)
## Call:
## ergm(formula = stats_g_net ~ edges + nodecov("j_count"))
## 
## Iterations:  9 out of 20 
## 
## Monte Carlo MLE Results:
##                   Estimate Std. Error MCMC % z value Pr(>|z|)    
## edges           -6.6869474  0.0058794      0 -1137.3   <1e-04 ***
## nodecov.j_count  0.0492235  0.0002061      0   238.8   <1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 31175779  on 22488571  degrees of freedom
##  Residual Deviance:   472283  on 22488569  degrees of freedom
##  
## AIC: 472287    BIC: 472317    (Smaller is better.)

Getting the estimate by decade:

results = list()

for(i in seq(1700, 1800, 10)){

df = gkl_actors  %>% filter(publication_decade== i) %>% 
  filter(!is.na(actor_id)) %>% 
    distinct(actor_id, estc_id)
  
  
  j_count = gkl_actors  %>% filter(publication_decade== i)%>% 
   filter(J_divergence<.42) %>% 
  filter(!is.na(actor_id)) %>% 
    count(actor_id, name = 'j_count')
  
    not_j_count = gkl_actors %>% filter(publication_decade== i) %>% 
   filter(J_divergence>=.42) %>% 
  filter(!is.na(actor_id)) %>% 
    count(actor_id, name = 'not_j_count')


g = df%>%
  graph_from_data_frame( directed=FALSE)

V(g)$type <- bipartite_mapping(g)$type 

gg = bipartite.projection(g)
  
  books_net = gg[[2]] %>% 
  as_tbl_graph() %>% 
  left_join(j_count, 
            by = c('name' = 'actor_id')) %>% 
  left_join(not_j_count, 
            by = c('name' = 'actor_id')) %>% 
    mutate(j_count = ifelse(is.na(j_count), 0, j_count)) %>% 
    mutate(not_j_count = ifelse(is.na(not_j_count), 0, not_j_count))
  
  
  stats_g_net = intergraph::asNetwork(books_net)
  
  m1 <- ergm(stats_g_net ~ edges  + nodecov('j_count'))
  
  r = m1$MCMCtheta %>% as_tibble()
  
  r$type = c('edges', 'j')
  
  r$year = i
  
  results[[as.character(i)]] = r
  
}

results = rbindlist(results)

results %>% 
  filter(type =='j') %>% 
  ggplot() + geom_col(aes(year, value))

What other factors influence edge ties?